{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 01 Identifying individuals, variables and categorical variables in a data set"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<iframe width=\"700\" height=\"400\" src=\"https://www.youtube.com/embed/EqeVXI4WNHM/\" frameborder=\"0\" allowfullscreen></iframe>\n"
      ],
      "text/plain": [
       "<IPython.core.display.HTML object>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "%%html\n",
    "<iframe width=\"700\" height=\"400\" src=\"https://www.youtube.com/embed/EqeVXI4WNHM/\" frameborder=\"0\" allowfullscreen></iframe>"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import matplotlib.pyplot as plt\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "import plotly.graph_objects as go\n",
    "import seaborn as sns\n",
    "from sklearn import preprocessing"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "os.environ['PYSPARK_SUBMIT_ARGS'] = '--packages org.apache.spark:spark-streaming-kafka-0-10_2.12:3.1.3,org.apache.spark:spark-sql-kafka-0-10_2.12:3.1.3 pyspark-shell'\n",
    "\n",
    "import findspark\n",
    "\n",
    "findspark.init()\n",
    "from pyspark.context import SparkContext\n",
    "from pyspark.ml.feature import OneHotEncoder, StringIndexer\n",
    "from pyspark.sql import functions as F\n",
    "from pyspark.sql.types import *\n",
    "from pyspark.sql.session import SparkSession\n",
    "\n",
    "spark = SparkSession.builder.appName(\"statistics\").master(\"local\").getOrCreate()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "import json\n",
    "import uuid\n",
    "from confluent_kafka import Producer\n",
    "from ksql import KSQLAPI\n",
    "bootstrap_servers='[::1]:9092'\n",
    "topic=f'stats0101{str(uuid.uuid4())[:7]}'\n",
    "msg_count=5\n",
    "client = KSQLAPI(url='http://localhost:8088')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "[khanacademy](https://www.khanacademy.org/math/ap-statistics/analyzing-categorical-ap/analyzing-one-categorical-variable/v/identifying-individuals-variables-and-categorical-variables-in-a-data-set?modal=1)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "![Identifying individuals, variables and categorical variables in a data set fig 1](./imgs/01-01-01.png)![Identifying individuals, variables and categorical variables in a data set fig 2](./imgs/01-01-02.png)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## data"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### dataset"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "metadata": {},
   "outputs": [],
   "source": [
    "dataset = {\n",
    "    \"Drink\": [\n",
    "        \"Brewed coffee\",\n",
    "        \"Caffe latte\",\n",
    "        \"Caffe mocha\",\n",
    "        \"Cappuccino\",\n",
    "        \"Iced brewed coffee\",\n",
    "        \"Chai latte\",\n",
    "    ],\n",
    "    \"Type\": [\"Hot\", \"Hot\", \"Hot\", \"Hot\", \"Cold\", \"Hot\"],\n",
    "    \"Calories\": [4, 100, 170, 60, 60, 120],\n",
    "    \"Sugars (g)\": [0, 14, 27, 8, 15, 25],\n",
    "    \"Caffein (mg)\": [260, 75, 95, 75, 120, 60],\n",
    "}"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Pandas Dataframe"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Type</th>\n",
       "      <th>Calories</th>\n",
       "      <th>Sugars (g)</th>\n",
       "      <th>Caffein (mg)</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Drink</th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>Brewed coffee</th>\n",
       "      <td>Hot</td>\n",
       "      <td>4</td>\n",
       "      <td>0</td>\n",
       "      <td>260</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Caffe latte</th>\n",
       "      <td>Hot</td>\n",
       "      <td>100</td>\n",
       "      <td>14</td>\n",
       "      <td>75</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Caffe mocha</th>\n",
       "      <td>Hot</td>\n",
       "      <td>170</td>\n",
       "      <td>27</td>\n",
       "      <td>95</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Cappuccino</th>\n",
       "      <td>Hot</td>\n",
       "      <td>60</td>\n",
       "      <td>8</td>\n",
       "      <td>75</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Iced brewed coffee</th>\n",
       "      <td>Cold</td>\n",
       "      <td>60</td>\n",
       "      <td>15</td>\n",
       "      <td>120</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Chai latte</th>\n",
       "      <td>Hot</td>\n",
       "      <td>120</td>\n",
       "      <td>25</td>\n",
       "      <td>60</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                    Type  Calories  Sugars (g)  Caffein (mg)\n",
       "Drink                                                       \n",
       "Brewed coffee        Hot         4           0           260\n",
       "Caffe latte          Hot       100          14            75\n",
       "Caffe mocha          Hot       170          27            95\n",
       "Cappuccino           Hot        60           8            75\n",
       "Iced brewed coffee  Cold        60          15           120\n",
       "Chai latte           Hot       120          25            60"
      ]
     },
     "execution_count": 28,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df = pd.DataFrame(dataset).set_index(\"Drink\")\n",
    "df"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Spark Dataframe"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "+------------------+----+--------+----------+------------+\n",
      "|             Drink|Type|Calories|Sugars (g)|Caffein (mg)|\n",
      "+------------------+----+--------+----------+------------+\n",
      "|     Brewed coffee| Hot|       4|         0|         260|\n",
      "|       Caffe latte| Hot|     100|        14|          75|\n",
      "|       Caffe mocha| Hot|     170|        27|          95|\n",
      "|        Cappuccino| Hot|      60|         8|          75|\n",
      "|Iced brewed coffee|Cold|      60|        15|         120|\n",
      "|        Chai latte| Hot|     120|        25|          60|\n",
      "+------------------+----+--------+----------+------------+\n",
      "\n"
     ]
    }
   ],
   "source": [
    "sdf = spark.createDataFrame(zip(*dataset.values()), schema=list(dataset.keys()))\n",
    "sdf.show()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Kafka producer"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0"
      ]
     },
     "execution_count": 30,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "p = Producer({'bootstrap.servers': bootstrap_servers})\n",
    "for i in zip(*dataset.values()):\n",
    "    x = dict(zip(dataset.keys(), i))\n",
    "    record_key = str(uuid.uuid4())\n",
    "    record_value = json.dumps(x)\n",
    "\n",
    "    p.produce(topic, value=record_value)\n",
    "    p.poll(0)\n",
    "    \n",
    "p.flush()    "
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### KSQL stream"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 31,
   "metadata": {
    "tags": []
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[{\"header\":{\"queryId\":\"transient_STATS0101E908401_8697743092071727740\",\"schema\":\"`Drink` STRING, `Type` STRING, `Calories` BIGINT, `Sugars (g)` BIGINT, `Caffein (mg)` BIGINT\"}},\n",
      "\n",
      "{\"row\":{\"columns\":[\"Brewed coffee\",\"Hot\",4,0,260]}},\n",
      "\n",
      "{\"row\":{\"columns\":[\"Caffe latte\",\"Hot\",100,14,75]}},\n",
      "\n",
      "{\"row\":{\"columns\":[\"Caffe mocha\",\"Hot\",170,27,95]}},\n",
      "\n",
      "{\"row\":{\"columns\":[\"Cappuccino\",\"Hot\",60,8,75]}},\n",
      "\n",
      "{\"row\":{\"columns\":[\"Iced brewed coffee\",\"Cold\",60,15,120]}},\n",
      "\n",
      "{\"row\":{\"columns\":[\"Chai latte\",\"Hot\",120,25,60]}},\n",
      "\n",
      "]\n",
      "\n",
      "\n"
     ]
    }
   ],
   "source": [
    "client.create_stream(table_name=topic,\n",
    "                     columns_type=['`Drink` string','`Type` string', '`Calories` bigint','`Sugars (g)` bigint', '`Caffein (mg)` bigint'],\n",
    "                     topic=topic,\n",
    "                     value_format='JSON'\n",
    "                    )\n",
    "res = client.query(f\"select * from {topic}\")\n",
    "while True:\n",
    "    try:\n",
    "        print(next(res))\n",
    "    except RuntimeError:\n",
    "        print('')\n",
    "        break"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Spark stream"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### read stream"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 32,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "root\n",
      " |-- key: binary (nullable = true)\n",
      " |-- value: binary (nullable = true)\n",
      " |-- topic: string (nullable = true)\n",
      " |-- partition: integer (nullable = true)\n",
      " |-- offset: long (nullable = true)\n",
      " |-- timestamp: timestamp (nullable = true)\n",
      " |-- timestampType: integer (nullable = true)\n",
      "\n"
     ]
    }
   ],
   "source": [
    "df_raw = spark \\\n",
    "  .readStream \\\n",
    "  .format('kafka') \\\n",
    "  .option('kafka.bootstrap.servers', bootstrap_servers) \\\n",
    "  .option(\"startingOffsets\", \"earliest\") \\\n",
    "  .option('subscribe', topic) \\\n",
    "  .load()\n",
    "df_raw.printSchema()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### write stream"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 66,
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "df_json = df_raw.selectExpr('CAST(value AS STRING) as json')\n",
    "df_json.select(F.from_json(df_json.json, sdf.schema).alias('stream_data')) \\\n",
    "  .writeStream \\\n",
    "  .queryName(topic) \\\n",
    "  .trigger(once=True) \\\n",
    "  .format(\"memory\") \\\n",
    "  .start() \\\n",
    "  .awaitTermination()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 68,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "+---------------------------------------+\n",
      "|stream_data                            |\n",
      "+---------------------------------------+\n",
      "|{Brewed coffee, Hot, 4, 0, 260}        |\n",
      "|{Caffe latte, Hot, 100, 14, 75}        |\n",
      "|{Caffe mocha, Hot, 170, 27, 95}        |\n",
      "|{Cappuccino, Hot, 60, 8, 75}           |\n",
      "|{Iced brewed coffee, Cold, 60, 15, 120}|\n",
      "|{Chai latte, Hot, 120, 25, 60}         |\n",
      "+---------------------------------------+\n",
      "\n"
     ]
    }
   ],
   "source": [
    "spark.sql(f\"select * from {topic}\").show(truncate=False)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "tags": []
   },
   "source": [
    "## Find and Replace"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Panads"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Type</th>\n",
       "      <th>Calories</th>\n",
       "      <th>Sugars (g)</th>\n",
       "      <th>Caffein (mg)</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Drink</th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>Brewed coffee</th>\n",
       "      <td>1</td>\n",
       "      <td>4</td>\n",
       "      <td>0</td>\n",
       "      <td>260</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Caffe latte</th>\n",
       "      <td>1</td>\n",
       "      <td>100</td>\n",
       "      <td>14</td>\n",
       "      <td>75</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Caffe mocha</th>\n",
       "      <td>1</td>\n",
       "      <td>170</td>\n",
       "      <td>27</td>\n",
       "      <td>95</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Cappuccino</th>\n",
       "      <td>1</td>\n",
       "      <td>60</td>\n",
       "      <td>8</td>\n",
       "      <td>75</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Iced brewed coffee</th>\n",
       "      <td>0</td>\n",
       "      <td>60</td>\n",
       "      <td>15</td>\n",
       "      <td>120</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Chai latte</th>\n",
       "      <td>1</td>\n",
       "      <td>120</td>\n",
       "      <td>25</td>\n",
       "      <td>60</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                    Type  Calories  Sugars (g)  Caffein (mg)\n",
       "Drink                                                       \n",
       "Brewed coffee          1         4           0           260\n",
       "Caffe latte            1       100          14            75\n",
       "Caffe mocha            1       170          27            95\n",
       "Cappuccino             1        60           8            75\n",
       "Iced brewed coffee     0        60          15           120\n",
       "Chai latte             1       120          25            60"
      ]
     },
     "execution_count": 11,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df_fr = df.copy()\n",
    "df_fr.replace({\"Type\": {\"Hot\": 1, \"Cold\": 0}}, inplace=True)\n",
    "df_fr"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Spark"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "+------------------+----+--------+----------+------------+\n",
      "|             Drink|Type|Calories|Sugars (g)|Caffein (mg)|\n",
      "+------------------+----+--------+----------+------------+\n",
      "|     Brewed coffee|   1|       4|         0|         260|\n",
      "|       Caffe latte|   1|     100|        14|          75|\n",
      "|       Caffe mocha|   1|     170|        27|          95|\n",
      "|        Cappuccino|   1|      60|         8|          75|\n",
      "|Iced brewed coffee|   0|      60|        15|         120|\n",
      "|        Chai latte|   1|     120|        25|          60|\n",
      "+------------------+----+--------+----------+------------+\n",
      "\n"
     ]
    }
   ],
   "source": [
    "sdf_fr = sdf.withColumn(\n",
    "    \"Type\", F.when(F.col(\"Type\") == \"Hot\", 1).when(F.col(\"Type\") == \"Cold\", 0)\n",
    ")\n",
    "sdf_fr.show()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Label Encoding"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Pandas"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 44,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Type</th>\n",
       "      <th>Calories</th>\n",
       "      <th>Sugars (g)</th>\n",
       "      <th>Caffein (mg)</th>\n",
       "      <th>Type_Hot</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Drink</th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>Brewed coffee</th>\n",
       "      <td>Hot</td>\n",
       "      <td>4</td>\n",
       "      <td>0</td>\n",
       "      <td>260</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Caffe latte</th>\n",
       "      <td>Hot</td>\n",
       "      <td>100</td>\n",
       "      <td>14</td>\n",
       "      <td>75</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Caffe mocha</th>\n",
       "      <td>Hot</td>\n",
       "      <td>170</td>\n",
       "      <td>27</td>\n",
       "      <td>95</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Cappuccino</th>\n",
       "      <td>Hot</td>\n",
       "      <td>60</td>\n",
       "      <td>8</td>\n",
       "      <td>75</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Iced brewed coffee</th>\n",
       "      <td>Cold</td>\n",
       "      <td>60</td>\n",
       "      <td>15</td>\n",
       "      <td>120</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Chai latte</th>\n",
       "      <td>Hot</td>\n",
       "      <td>120</td>\n",
       "      <td>25</td>\n",
       "      <td>60</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                    Type  Calories  Sugars (g)  Caffein (mg)  Type_Hot\n",
       "Drink                                                                 \n",
       "Brewed coffee        Hot         4           0           260         1\n",
       "Caffe latte          Hot       100          14            75         1\n",
       "Caffe mocha          Hot       170          27            95         1\n",
       "Cappuccino           Hot        60           8            75         1\n",
       "Iced brewed coffee  Cold        60          15           120         0\n",
       "Chai latte           Hot       120          25            60         1"
      ]
     },
     "execution_count": 44,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df_lc = df.copy()\n",
    "df_lc[\"Type_Hot\"] = df_lc[\"Type\"].astype(\"category\").cat.codes\n",
    "df_lc"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Sklearn"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 80,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Type</th>\n",
       "      <th>Calories</th>\n",
       "      <th>Sugars (g)</th>\n",
       "      <th>Caffein (mg)</th>\n",
       "      <th>Type_Hot</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Drink</th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>Brewed coffee</th>\n",
       "      <td>Hot</td>\n",
       "      <td>4</td>\n",
       "      <td>0</td>\n",
       "      <td>260</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Caffe latte</th>\n",
       "      <td>Hot</td>\n",
       "      <td>100</td>\n",
       "      <td>14</td>\n",
       "      <td>75</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Caffe mocha</th>\n",
       "      <td>Hot</td>\n",
       "      <td>170</td>\n",
       "      <td>27</td>\n",
       "      <td>95</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Cappuccino</th>\n",
       "      <td>Hot</td>\n",
       "      <td>60</td>\n",
       "      <td>8</td>\n",
       "      <td>75</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Iced brewed coffee</th>\n",
       "      <td>Cold</td>\n",
       "      <td>60</td>\n",
       "      <td>15</td>\n",
       "      <td>120</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Chai latte</th>\n",
       "      <td>Hot</td>\n",
       "      <td>120</td>\n",
       "      <td>25</td>\n",
       "      <td>60</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                    Type  Calories  Sugars (g)  Caffein (mg)  Type_Hot\n",
       "Drink                                                                 \n",
       "Brewed coffee        Hot         4           0           260         1\n",
       "Caffe latte          Hot       100          14            75         1\n",
       "Caffe mocha          Hot       170          27            95         1\n",
       "Cappuccino           Hot        60           8            75         1\n",
       "Iced brewed coffee  Cold        60          15           120         0\n",
       "Chai latte           Hot       120          25            60         1"
      ]
     },
     "execution_count": 80,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df_lc = df.copy()\n",
    "df_lc[\"Type_Hot\"] = preprocessing.LabelEncoder().fit_transform(df[\"Type\"])\n",
    "df_lc"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Spark"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 73,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "+------------------+----+--------+----------+------------+---------+\n",
      "|             Drink|Type|Calories|Sugars (g)|Caffein (mg)|Type_Cold|\n",
      "+------------------+----+--------+----------+------------+---------+\n",
      "|     Brewed coffee| Hot|       4|         0|         260|      0.0|\n",
      "|       Caffe latte| Hot|     100|        14|          75|      0.0|\n",
      "|       Caffe mocha| Hot|     170|        27|          95|      0.0|\n",
      "|        Cappuccino| Hot|      60|         8|          75|      0.0|\n",
      "|Iced brewed coffee|Cold|      60|        15|         120|      1.0|\n",
      "|        Chai latte| Hot|     120|        25|          60|      0.0|\n",
      "+------------------+----+--------+----------+------------+---------+\n",
      "\n"
     ]
    }
   ],
   "source": [
    "sdf_lc = StringIndexer(inputCol=\"Type\", outputCol=\"Type_Cold\").fit(sdf).transform(sdf)\n",
    "sdf_lc.show()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## One-Hot Encoding"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Pandas"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 46,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Calories</th>\n",
       "      <th>Sugars (g)</th>\n",
       "      <th>Caffein (mg)</th>\n",
       "      <th>Type_Cold</th>\n",
       "      <th>Type_Hot</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Drink</th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>Brewed coffee</th>\n",
       "      <td>4</td>\n",
       "      <td>0</td>\n",
       "      <td>260</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Caffe latte</th>\n",
       "      <td>100</td>\n",
       "      <td>14</td>\n",
       "      <td>75</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Caffe mocha</th>\n",
       "      <td>170</td>\n",
       "      <td>27</td>\n",
       "      <td>95</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Cappuccino</th>\n",
       "      <td>60</td>\n",
       "      <td>8</td>\n",
       "      <td>75</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Iced brewed coffee</th>\n",
       "      <td>60</td>\n",
       "      <td>15</td>\n",
       "      <td>120</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Chai latte</th>\n",
       "      <td>120</td>\n",
       "      <td>25</td>\n",
       "      <td>60</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                    Calories  Sugars (g)  Caffein (mg)  Type_Cold  Type_Hot\n",
       "Drink                                                                      \n",
       "Brewed coffee              4           0           260          0         1\n",
       "Caffe latte              100          14            75          0         1\n",
       "Caffe mocha              170          27            95          0         1\n",
       "Cappuccino                60           8            75          0         1\n",
       "Iced brewed coffee        60          15           120          1         0\n",
       "Chai latte               120          25            60          0         1"
      ]
     },
     "execution_count": 46,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df_ohc = pd.get_dummies(df, columns=[\"Type\"], prefix=\"Type\")\n",
    "df_ohc"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Sklearn"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 82,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Calories</th>\n",
       "      <th>Sugars (g)</th>\n",
       "      <th>Caffein (mg)</th>\n",
       "      <th>Type_Cold</th>\n",
       "      <th>Type_Hot</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Drink</th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>Brewed coffee</th>\n",
       "      <td>4</td>\n",
       "      <td>0</td>\n",
       "      <td>260</td>\n",
       "      <td>0.0</td>\n",
       "      <td>1.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Caffe latte</th>\n",
       "      <td>100</td>\n",
       "      <td>14</td>\n",
       "      <td>75</td>\n",
       "      <td>0.0</td>\n",
       "      <td>1.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Caffe mocha</th>\n",
       "      <td>170</td>\n",
       "      <td>27</td>\n",
       "      <td>95</td>\n",
       "      <td>0.0</td>\n",
       "      <td>1.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Cappuccino</th>\n",
       "      <td>60</td>\n",
       "      <td>8</td>\n",
       "      <td>75</td>\n",
       "      <td>0.0</td>\n",
       "      <td>1.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Iced brewed coffee</th>\n",
       "      <td>60</td>\n",
       "      <td>15</td>\n",
       "      <td>120</td>\n",
       "      <td>1.0</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Chai latte</th>\n",
       "      <td>120</td>\n",
       "      <td>25</td>\n",
       "      <td>60</td>\n",
       "      <td>0.0</td>\n",
       "      <td>1.0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                    Calories  Sugars (g)  Caffein (mg)  Type_Cold  Type_Hot\n",
       "Drink                                                                      \n",
       "Brewed coffee              4           0           260        0.0       1.0\n",
       "Caffe latte              100          14            75        0.0       1.0\n",
       "Caffe mocha              170          27            95        0.0       1.0\n",
       "Cappuccino                60           8            75        0.0       1.0\n",
       "Iced brewed coffee        60          15           120        1.0       0.0\n",
       "Chai latte               120          25            60        0.0       1.0"
      ]
     },
     "execution_count": 82,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "x = (\n",
    "    preprocessing.OneHotEncoder()\n",
    "    .fit_transform(df[\"Type\"].values.reshape(-1, 1))\n",
    "    .toarray()\n",
    ")\n",
    "df_ohc = pd.concat(\n",
    "    [\n",
    "        df.drop(columns=\"Type\").reset_index(),\n",
    "        pd.DataFrame(x, columns=[\"Type_Cold\", \"Type_Hot\"]),\n",
    "    ],\n",
    "    axis=1,\n",
    ")\n",
    "df_ohc.set_index(\"Drink\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Spark"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 75,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "+------------------+----+--------+----------+------------+---------+-------------+\n",
      "|             Drink|Type|Calories|Sugars (g)|Caffein (mg)|Type_Cold|     Type_Vec|\n",
      "+------------------+----+--------+----------+------------+---------+-------------+\n",
      "|     Brewed coffee| Hot|       4|         0|         260|      0.0|(1,[0],[1.0])|\n",
      "|       Caffe latte| Hot|     100|        14|          75|      0.0|(1,[0],[1.0])|\n",
      "|       Caffe mocha| Hot|     170|        27|          95|      0.0|(1,[0],[1.0])|\n",
      "|        Cappuccino| Hot|      60|         8|          75|      0.0|(1,[0],[1.0])|\n",
      "|Iced brewed coffee|Cold|      60|        15|         120|      1.0|    (1,[],[])|\n",
      "|        Chai latte| Hot|     120|        25|          60|      0.0|(1,[0],[1.0])|\n",
      "+------------------+----+--------+----------+------------+---------+-------------+\n",
      "\n"
     ]
    }
   ],
   "source": [
    "sdf_ohc = (\n",
    "    OneHotEncoder(inputCol=\"Type_Cold\", outputCol=\"Type_Vec\")\n",
    "    .fit(sdf_lc)\n",
    "    .transform(sdf_lc)\n",
    ")\n",
    "sdf_ohc.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.12"
  },
  "toc-autonumbering": true,
  "toc-showcode": false,
  "toc-showmarkdowntxt": true
 },
 "nbformat": 4,
 "nbformat_minor": 4
}